{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SIFTS_unit Workflow (流程)\n",
    "> Last modified time: 2019-08-17T12:13:57+08:00 By ZeFeng zhu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import pandas as pd\n",
    "sys.path.append('../py_src/')\n",
    "from SIFTS_unit import SIFTS_unit\n",
    "from UniProt_unit import UniProt_unit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build Complete Basic DataSet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```py\n",
    "raw_sifts_file_path = '../../data/pdb_uniprot_SIFTS_0714.csv'\n",
    "add_rangeInfo_sifts_file_path = '../../data/pdb_uniprot_SIFTS_NEW0815.tsv'\n",
    "add_InDe_sifts_file_path = '../../data/Mapping_Pipeline/sifts_files/pdb_uniprot_SIFTS_delwithInDe_0815.tsv'\n",
    "unp_len_file_path = '../../data/Mapping_Pipeline/unp_len_list_0817.tsv'\n",
    "seg_edited_file_path = '../../data/Mapping_Pipeline/sifts_files/uniprot_segments_observed_edited_0815.tsv'\n",
    "add_unpLen_segInfo_file_path = '../../data/Mapping_Pipeline/sifts_files/???' # TODO: replace ??? with the real output file name\n",
    "\n",
    "sifts_demo = SIFTS_unit()\n",
    "info_dict = sifts_demo.get_info_from_uniprot_pdb_file() # Download a new file\n",
    "sifts_demo.set_lists(info_dict['pdb_set'], [])\n",
    "\n",
    "fail_list = sifts_demo.get_raw_SIFTS(raw_sifts_file_path)\n",
    "sifts_df_1 = sifts_demo.handle_SIFTS(outputPath=add_rangeInfo_sifts_file_path)\n",
    "sifts_df_2 = sifts_demo.deal_with_insertionDeletion_SIFTS(sifts_df=sifts_df_1, outputPath=add_InDe_sifts_file_path)\n",
    "\n",
    "# Add UniProt Length Info\n",
    "uniprot_demo = UniProt_unit()\n",
    "uniprot_demo.get_info_from_uniprot(['id', 'length'], unp_len_file_path, unp_list=list(info_dict['unp_set']))\n",
    "unp_len_df = pd.read_csv(unp_len_file_path, sep='\\t', usecols=['Entry', 'Length'])\n",
    "sifts_df_3 = sifts_demo.add_unp_len_SIFTS(sifts_df=sifts_df_2, unpLen_df=unp_len_df)\n",
    "\n",
    "# Add Segment Info\n",
    "seg_df = sifts_demo.get_seg_info_from_uniprot_segments_file(outputPath=seg_edited_file_path) # Download a new file\n",
    "sifts_demo.add_seg_info_to_SIFTS(sifts_df=sifts_df_3, seg_df=seg_df, outputPath=add_unpLen_segInfo_file_path)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Update Basic DataSet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```py\n",
    "demo_pdb_list_path = '../../data/demo_pdb_list.tsv'\n",
    "raw_sifts_file_path = '../../data/pdb_uniprot_SIFTS_demo.csv'\n",
    "add_rangeInfo_sifts_file_path = '../../data/pdb_uniprot_SIFTS_NEW0815.tsv'\n",
    "add_InDe_sifts_file_path = '../../data/Mapping_Pipeline/sifts_files/pdb_uniprot_SIFTS_delwithInDe_0815.tsv'\n",
    "unp_len_file_path = '../../data/Mapping_Pipeline/unp_len_list_0817.tsv'\n",
    "new_seg_edited_file_path = '../../data/Mapping_Pipeline/sifts_files/uniprot_segments_observed_edited_new.tsv' # Does not exist yet\n",
    "add_unpLen_segInfo_file_path = '../../data/Mapping_Pipeline/sifts_files/???' # TODO: replace ??? with the real output file name\n",
    "\n",
    "demo_pdb_list = pd.read_csv(demo_pdb_list_path, sep='\\t', usecols=['pdb_id'])['pdb_id']\n",
    "old_pdb_list = pd.read_csv(add_InDe_sifts_file_path, sep='\\t', usecols=['pdb_id'])['pdb_id']\n",
    "\n",
    "new_pdb_list = list(set(demo_pdb_list) - set(old_pdb_list))\n",
    "\n",
    "sifts_demo = SIFTS_unit()\n",
    "sifts_demo.set_lists(new_pdb_list, [])\n",
    "\n",
    "fail_list = sifts_demo.get_raw_SIFTS(raw_sifts_file_path)\n",
    "sifts_df_1 = sifts_demo.handle_SIFTS()\n",
    "sifts_df_2 = sifts_demo.deal_with_insertionDeletion_SIFTS(sifts_df=sifts_df_1)\n",
    "\n",
    "sifts_df_1.to_csv(add_rangeInfo_sifts_file_path, sep='\\t', header=False, index=False, mode='a+')\n",
    "sifts_df_2.to_csv(add_InDe_sifts_file_path, sep='\\t', header=False, index=False, mode='a+')\n",
    "\n",
    "# Update(add) UniProt Length Info\n",
    "unp_list = sifts_df_2.apply(lambda x: x['UniProt'].split('-')[0], axis=1).drop_duplicates().tolist()\n",
    "uniprot_demo = UniProt_unit()\n",
    "uniprot_demo.get_info_from_uniprot(['id', 'length'], unp_len_file_path, unp_list=unp_list)\n",
    "unp_len_df = pd.read_csv(unp_len_file_path, sep='\\t', usecols=['Entry', 'Length'])\n",
    "sifts_df_3 = sifts_demo.add_unp_len_SIFTS(sifts_df=sifts_df_2, unpLen_df=unp_len_df)\n",
    "\n",
    "# Update(add) Segment info\n",
    "seg_df = sifts_demo.get_seg_info_from_uniprot_segments_file(outputPath=new_seg_edited_file_path) # Download a new file\n",
    "sifts_df_4 = sifts_demo.add_seg_info_to_SIFTS(sifts_df=sifts_df_3, seg_df=seg_df)\n",
    "\n",
    "sifts_df_4.to_csv(add_unpLen_segInfo_file_path, sep='\\t', mode='a+', index=False, header=False)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
